1.5. Housing in Brazil 🇧🇷
# Import Matplotlib, pandas, and plotly
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
Task 1.5.1
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.head()
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 |
Task 1.5.2
df1.dropna(inplace=True)
df1.head()
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 |
Task 1.5.3
df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True)
# Convert them to float
df1["lat"] = df1["lat"].astype(float)
df1["lon"] = df1["lon"].astype(float)
# Check result
df1.head()
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 | -9.644305 | -35.708814 |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 | -9.643093 | -35.704840 |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 | -9.622703 | -35.729795 |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 | -9.622837 | -35.719556 |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 | -9.654955 | -35.700227 |
Task 1.5.4
df1["state"] = df1["place_with_parent_names"].str.split("|").str[2]
# Check result
df1[["place_with_parent_names", "state"]].head()
| place_with_parent_names | state | |
|---|---|---|
| 0 | |Brasil|Alagoas|Maceió| | Alagoas |
| 1 | |Brasil|Alagoas|Maceió| | Alagoas |
| 2 | |Brasil|Alagoas|Maceió| | Alagoas |
| 3 | |Brasil|Alagoas|Maceió| | Alagoas |
| 4 | |Brasil|Alagoas|Maceió| | Alagoas |
Task 1.5.5
df1["price_usd"] = (
df1["price_usd"]
.astype(str) # ensure strings
.str.replace(r"[$,]", "", regex=True) # remove $ and ,
.astype(float) # convert to float
)
df1.head()
| property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
Task 1.5.6
df1.drop(columns=["place_with_parent_names"], inplace=True)
df1.head()
| property_type | region | lat-lon | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | Northeast | -9.6443051,-35.7088142 | 110.0 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | Northeast | -9.6430934,-35.70484 | 65.0 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | Northeast | -9.6227033,-35.7297953 | 211.0 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | Northeast | -9.622837,-35.719556 | 99.0 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | Northeast | -9.654955,-35.700227 | 55.0 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
Task 1.5.7
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
df2.head()
| property_type | state | region | lat | lon | area_m2 | price_brl | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 |
Task 1.5.8
df2["price_usd"] = (df2["price_brl"] / 3.19).round(2)
df2.head()
| property_type | state | region | lat | lon | area_m2 | price_brl | price_usd | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 | 129850.46 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 | 265958.79 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 | 93867.80 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 | 265958.79 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 | 145495.10 |
Task 1.5.9
df2.drop(columns=["price_brl"], inplace=True)
df2.dropna(inplace=True)
df2.head()
| property_type | state | region | lat | lon | area_m2 | price_usd | |
|---|---|---|---|---|---|---|---|
| 0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 129850.46 |
| 1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 265958.79 |
| 2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 93867.80 |
| 3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 265958.79 |
| 4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 145495.10 |
Task 1.5.10
df = pd.concat([df1, df2])
print("df shape:", df.shape)
df.to_csv("data/brasil-real-estate-clean.csv", index=False)
df.head()
df shape: (22844, 8)
| property_type | region | lat-lon | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|---|
| 0 | apartment | Northeast | -9.6443051,-35.7088142 | 110.0 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
| 1 | apartment | Northeast | -9.6430934,-35.70484 | 65.0 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
| 2 | house | Northeast | -9.6227033,-35.7297953 | 211.0 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
| 3 | apartment | Northeast | -9.622837,-35.719556 | 99.0 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
| 4 | apartment | Northeast | -9.654955,-35.700227 | 55.0 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
Explore¶
fig = px.scatter_mapbox(
df,
lat="lat",
lon="lon",
center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil
width=600,
height=600,
hover_data=["price_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
Task 1.5.11
summary_stats = df[["area_m2", "price_usd"]].describe()
summary_stats
| area_m2 | price_usd | |
|---|---|---|
| count | 22844.000000 | 22844.000000 |
| mean | 115.020224 | 194987.315515 |
| std | 47.742932 | 103617.682979 |
| min | 53.000000 | 74892.340000 |
| 25% | 76.000000 | 113898.770000 |
| 50% | 103.000000 | 165697.555000 |
| 75% | 142.000000 | 246900.882500 |
| max | 252.000000 | 525659.720000 |
In the following task, you'll notice a small change in how plots are created compared to what you saw in the lessons.
While the lessons use the global matplotlib method like plt.plot(...), in this task, you are expected to use the object-oriented (OOP) API instead.
This means creating your plots using fig, ax = plt.subplots() and then calling plotting methods on the ax object, such as ax.plot(...), ax.hist(...), or ax.scatter(...).
If you're using pandas’ or seaborn’s built-in plotting methods (like df.plot() or sns.lineplot()), make sure to pass the ax=ax argument so that the plot is rendered on the correct axes.
This approach is considered best practice and will be used consistently across all graded tasks that involve matplotlib.
Task 1.5.12
# Don't change the code below 👇
fig, ax = plt.subplots()
prices = df["price_usd"].head(20000).dropna()
# Build histogram
ax.hist(prices)
# Label axes
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")
# Add title
plt.title("Distribution of Home Prices")
Text(0.5, 1.0, 'Distribution of Home Prices')
Task 1.5.13
# Don't change the code below 👇
fig, ax = plt.subplots()
#Build box plot
ax.boxplot(df["area_m2"], vert=False)
# Label x-axis
plt.xlabel("Area [sq meters]")
# Add title
plt.title("Distribution of Home Sizes")
Text(0.5, 1.0, 'Distribution of Home Sizes')
Task 1.5.14
mean_price_by_region = df.groupby("region")["price_usd"].mean()
mean_price_by_region = mean_price_by_region.sort_values()
mean_price_by_region
region Central-West 178596.283663 North 181308.958188 Northeast 185422.985482 South 189012.345360 Southeast 208996.762761 Name: price_usd, dtype: float64
Task 1.5.15
# Don't change the code below 👇
fig, ax = plt.subplots()
# Build bar chart, label axes, add title
mean_price_by_region.plot(
kind='bar',
xlabel="Region",
ylabel="Mean Price [USD]",
title="Mean Home Price by Region"
, ax=ax)
<Axes: title={'center': 'Mean Home Price by Region'}, xlabel='Region', ylabel='Mean Price [USD]'>
Task 1.5.16
df.drop(columns="lat-lon", inplace=True)
df_south = df[df["region"] == "South"]
df_south.head()
| property_type | region | area_m2 | price_usd | lat | lon | state | |
|---|---|---|---|---|---|---|---|
| 9304 | apartment | South | 127.0 | 296448.85 | -25.455704 | -49.292918 | Paraná |
| 9305 | apartment | South | 104.0 | 219996.25 | -25.455704 | -49.292918 | Paraná |
| 9306 | apartment | South | 100.0 | 194210.50 | -25.460236 | -49.293812 | Paraná |
| 9307 | apartment | South | 77.0 | 149252.94 | -25.460236 | -49.293812 | Paraná |
| 9308 | apartment | South | 73.0 | 144167.75 | -25.460236 | -49.293812 | Paraná |
Task 1.5.17
homes_by_state = df_south["state"].value_counts()
homes_by_state
Rio Grande do Sul 2643 Santa Catarina 2634 Paraná 2544 Name: state, dtype: int64
Task 1.5.18
# Subset data
df_south_rgs = df_south["state"].value_counts().idxmax()
# Don't change the code below 👇
fig, ax = plt.subplots()
df_top_state = df_south[df_south["state"] == df_south_rgs]
# Build scatter plot
ax.scatter(x=df_top_state["area_m2"], y=df_top_state["price_usd"])
# Label axes
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")
# Add title
plt.title(f"{df_south_rgs}: Price vs. Area")
Text(0.5, 1.0, 'Rio Grande do Sul: Price vs. Area')
Task 1.5.19
# First, filter only South region data
df_south = df[df["region"] == "South"]
# Create the dictionary with correlations
south_states_corr = {}
# Loop through the three states
for state in df_south["state"].unique():
state_data = df_south[df_south["state"] == state]
corr = state_data["area_m2"].corr(state_data["price_usd"])
south_states_corr[state] = corr
south_states_corr
{'Paraná': 0.5436659935502657,
'Rio Grande do Sul': 0.5773267433871904,
'Santa Catarina': 0.5068121769989854}
Copyright 2024 WorldQuant University. This content is licensed solely for personal use. Redistribution or publication of this material is strictly prohibited.